For my first project, I have selected the “LA International Airport Passenger Traffic by terminal” dataset from Data.gov. This is a simple dataset with 6 variables and 5870 observations. I have selected this dataset to see if I can know the busiest terminal with respect to the total number of passengers.
This dataset has the following 6 columns:
DataExtractDate
ReportPeriod
Terminal
Arrival_Departure
Domestic_International
Passenger_Count
I will be using the following library’s to load, view and clean data:
I have already installed the above three library’s, so I will start with loading them.
Loading the library’s…
knitr::opts_chunk$set(echo = TRUE)
library(tidyr)
library(dplyr)
library(tidyverse)
Load the source file from the working directory, get the working directory and ensure the source files is located in this path…
getwd()
Load the file in R…
LA_Airport_Source <- read_csv("Los_Angeles_International_Airport_-_Passenger_Traffic_By_Terminal.csv")
## Parsed with column specification:
## cols(
## DataExtractDate = col_character(),
## ReportPeriod = col_character(),
## Terminal = col_character(),
## Arrival_Departure = col_character(),
## Domestic_International = col_character(),
## Passenger_Count = col_double()
## )
View the source file contents by using the newly created variable “LA_Airport_Data”.
LA_Airport_Source
## # A tibble: 5,870 x 6
## DataExtractDate ReportPeriod Terminal Arrival_Departu… Domestic_Intern…
## <chr> <chr> <chr> <chr> <chr>
## 1 05/01/2014 12:… 01/01/2006 … Imperia… Arrival Domestic
## 2 05/01/2014 12:… 01/01/2006 … Imperia… Departure Domestic
## 3 05/01/2014 12:… 01/01/2006 … Misc. T… Arrival Domestic
## 4 05/01/2014 12:… 01/01/2006 … Misc. T… Departure Domestic
## 5 05/01/2014 12:… 01/01/2006 … Termina… Arrival Domestic
## 6 05/01/2014 12:… 01/01/2006 … Termina… Departure Domestic
## 7 05/01/2014 12:… 01/01/2006 … Termina… Departure International
## 8 05/01/2014 12:… 01/01/2006 … Termina… Arrival Domestic
## 9 05/01/2014 12:… 01/01/2006 … Termina… Arrival International
## 10 05/01/2014 12:… 01/01/2006 … Termina… Departure Domestic
## # … with 5,860 more rows, and 1 more variable: Passenger_Count <dbl>
Know the class of “LA_Airport_Source”
class(LA_Airport_Source)
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
View the source file contents as a tibble using the tbl_df. I wanted to see if there is any difference in the way the data is displayed.
tbl_df(LA_Airport_Source)
## # A tibble: 5,870 x 6
## DataExtractDate ReportPeriod Terminal Arrival_Departu… Domestic_Intern…
## <chr> <chr> <chr> <chr> <chr>
## 1 05/01/2014 12:… 01/01/2006 … Imperia… Arrival Domestic
## 2 05/01/2014 12:… 01/01/2006 … Imperia… Departure Domestic
## 3 05/01/2014 12:… 01/01/2006 … Misc. T… Arrival Domestic
## 4 05/01/2014 12:… 01/01/2006 … Misc. T… Departure Domestic
## 5 05/01/2014 12:… 01/01/2006 … Termina… Arrival Domestic
## 6 05/01/2014 12:… 01/01/2006 … Termina… Departure Domestic
## 7 05/01/2014 12:… 01/01/2006 … Termina… Departure International
## 8 05/01/2014 12:… 01/01/2006 … Termina… Arrival Domestic
## 9 05/01/2014 12:… 01/01/2006 … Termina… Arrival International
## 10 05/01/2014 12:… 01/01/2006 … Termina… Departure Domestic
## # … with 5,860 more rows, and 1 more variable: Passenger_Count <dbl>
View the stucture of the data to know the data types of the variables.
str(LA_Airport_Source)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 5870 obs. of 6 variables:
## $ DataExtractDate : chr "05/01/2014 12:00:00 AM" "05/01/2014 12:00:00 AM" "05/01/2014 12:00:00 AM" "05/01/2014 12:00:00 AM" ...
## $ ReportPeriod : chr "01/01/2006 12:00:00 AM" "01/01/2006 12:00:00 AM" "01/01/2006 12:00:00 AM" "01/01/2006 12:00:00 AM" ...
## $ Terminal : chr "Imperial Terminal" "Imperial Terminal" "Misc. Terminal" "Misc. Terminal" ...
## $ Arrival_Departure : chr "Arrival" "Departure" "Arrival" "Departure" ...
## $ Domestic_International: chr "Domestic" "Domestic" "Domestic" "Domestic" ...
## $ Passenger_Count : num 490 498 753 688 401535 ...
## - attr(*, "spec")=
## .. cols(
## .. DataExtractDate = col_character(),
## .. ReportPeriod = col_character(),
## .. Terminal = col_character(),
## .. Arrival_Departure = col_character(),
## .. Domestic_International = col_character(),
## .. Passenger_Count = col_double()
## .. )
View the first and last 10 records to understand the data…
print.data.frame(head(LA_Airport_Source,10))
## DataExtractDate ReportPeriod Terminal
## 1 05/01/2014 12:00:00 AM 01/01/2006 12:00:00 AM Imperial Terminal
## 2 05/01/2014 12:00:00 AM 01/01/2006 12:00:00 AM Imperial Terminal
## 3 05/01/2014 12:00:00 AM 01/01/2006 12:00:00 AM Misc. Terminal
## 4 05/01/2014 12:00:00 AM 01/01/2006 12:00:00 AM Misc. Terminal
## 5 05/01/2014 12:00:00 AM 01/01/2006 12:00:00 AM Terminal 1
## 6 05/01/2014 12:00:00 AM 01/01/2006 12:00:00 AM Terminal 1
## 7 05/01/2014 12:00:00 AM 01/01/2006 12:00:00 AM Terminal 1
## 8 05/01/2014 12:00:00 AM 01/01/2006 12:00:00 AM Terminal 2
## 9 05/01/2014 12:00:00 AM 01/01/2006 12:00:00 AM Terminal 2
## 10 05/01/2014 12:00:00 AM 01/01/2006 12:00:00 AM Terminal 2
## Arrival_Departure Domestic_International Passenger_Count
## 1 Arrival Domestic 490
## 2 Departure Domestic 498
## 3 Arrival Domestic 753
## 4 Departure Domestic 688
## 5 Arrival Domestic 401535
## 6 Departure Domestic 389745
## 7 Departure International 561
## 8 Arrival Domestic 98991
## 9 Arrival International 163067
## 10 Departure Domestic 93672
print.data.frame(tail(LA_Airport_Source,10))
## DataExtractDate ReportPeriod
## 1 05/15/2019 08:00:42 AM 03/01/2019 12:00:00 AM
## 2 05/15/2019 08:00:42 AM 03/01/2019 12:00:00 AM
## 3 05/15/2019 08:00:42 AM 03/01/2019 12:00:00 AM
## 4 05/15/2019 08:00:42 AM 03/01/2019 12:00:00 AM
## 5 05/15/2019 08:00:42 AM 03/01/2019 12:00:00 AM
## 6 05/15/2019 08:00:42 AM 03/01/2019 12:00:00 AM
## 7 05/15/2019 08:00:42 AM 03/01/2019 12:00:00 AM
## 8 05/15/2019 08:00:42 AM 03/01/2019 12:00:00 AM
## 9 05/15/2019 08:00:42 AM 03/01/2019 12:00:00 AM
## 10 05/15/2019 08:00:42 AM 03/01/2019 12:00:00 AM
## Terminal Arrival_Departure
## 1 Terminal 7 Departure
## 2 Terminal 7 Departure
## 3 Terminal 8 Arrival
## 4 Terminal 8 Arrival
## 5 Terminal 8 Departure
## 6 Terminal 8 Departure
## 7 Tom Bradley International Terminal Arrival
## 8 Tom Bradley International Terminal Arrival
## 9 Tom Bradley International Terminal Departure
## 10 Tom Bradley International Terminal Departure
## Domestic_International Passenger_Count
## 1 Domestic 367998
## 2 International 40493
## 3 Domestic 144933
## 4 International 2132
## 5 Domestic 141931
## 6 International 5225
## 7 Domestic 5066
## 8 International 687288
## 9 Domestic 24976
## 10 International 657435
Source file contains data from January 2006 to March 2019. For every terminal, we have passenger counts for arrival or departure and whether it is domestic or international.
“DataExtractDate” and “ReportPeriod” are stored as characters instead of a date time variable. Also the time indicated in the variable ‘Reportperiod’ is always defaulted to ‘12:00:00 AM’ and therefore I will not be able to find the busy hours at these terminals. I will have to remove it and maybe split the ‘ReportPeriod’ date into month and year to better understand the passenger traffic and find any patters if possible. I might also have to remove ‘DataExtractData’ as it does not help in finding out the busiest terminal.
To undertsand the data in the next three columns - “Terminal”,“Arrival_Departure” and “Domestic_International”, I got the unique list from these columns.
unique(LA_Airport_Source$Terminal)
## [1] "Imperial Terminal"
## [2] "Misc. Terminal"
## [3] "Terminal 1"
## [4] "Terminal 2"
## [5] "Terminal 3"
## [6] "Terminal 4"
## [7] "Terminal 5"
## [8] "Terminal 6"
## [9] "Terminal 7"
## [10] "Terminal 8"
## [11] "Tom Bradley International Terminal"
unique(LA_Airport_Source$Arrival_Departure)
## [1] "Arrival" "Departure"
unique(LA_Airport_Source$Domestic_International)
## [1] "Domestic" "International"
The above three variables are categorical and hence it would be better to change the datatype to factor for all three of them.Moreover it would be best to seperate arrivals,departures,domestic and international variables into columns.
Now lets check the range of passenger count, just to have an idea of number of passengers.
range(LA_Airport_Source$Passenger_Count)
## [1] 1 889348
Summary of the source data
summary(LA_Airport_Source)
## DataExtractDate ReportPeriod Terminal
## Length:5870 Length:5870 Length:5870
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## Arrival_Departure Domestic_International Passenger_Count
## Length:5870 Length:5870 Min. : 1
## Class :character Class :character 1st Qu.: 19674
## Mode :character Mode :character Median :105149
## Mean :154972
## 3rd Qu.:270988
## Max. :889348
LA_Airport_Modified <- LA_Airport_Source %>% select(ReportPeriod,Terminal,Arrival_Departure,Domestic_International,Passenger_Count)
LA_Airport_Modified
## # A tibble: 5,870 x 5
## ReportPeriod Terminal Arrival_Departu… Domestic_Intern… Passenger_Count
## <chr> <chr> <chr> <chr> <dbl>
## 1 01/01/2006 1… Imperia… Arrival Domestic 490
## 2 01/01/2006 1… Imperia… Departure Domestic 498
## 3 01/01/2006 1… Misc. T… Arrival Domestic 753
## 4 01/01/2006 1… Misc. T… Departure Domestic 688
## 5 01/01/2006 1… Termina… Arrival Domestic 401535
## 6 01/01/2006 1… Termina… Departure Domestic 389745
## 7 01/01/2006 1… Termina… Departure International 561
## 8 01/01/2006 1… Termina… Arrival Domestic 98991
## 9 01/01/2006 1… Termina… Arrival International 163067
## 10 01/01/2006 1… Termina… Departure Domestic 93672
## # … with 5,860 more rows
LA_Airport_Modified$ReportPeriod <- as.Date(LA_Airport_Modified$ReportPeriod, "%m/%d/%Y")
LA_Airport_Modified <- LA_Airport_Modified %>%
mutate(Year = format(ReportPeriod, "%Y"),Month = format(ReportPeriod, "%m"))
LA_Airport_Modified$Year <- as.numeric(LA_Airport_Modified$Year)
LA_Airport_Modified$Month <- as.numeric(LA_Airport_Modified$Month)
LA_Airport_Modified
## # A tibble: 5,870 x 7
## ReportPeriod Terminal Arrival_Departu… Domestic_Intern… Passenger_Count
## <date> <chr> <chr> <chr> <dbl>
## 1 2006-01-01 Imperia… Arrival Domestic 490
## 2 2006-01-01 Imperia… Departure Domestic 498
## 3 2006-01-01 Misc. T… Arrival Domestic 753
## 4 2006-01-01 Misc. T… Departure Domestic 688
## 5 2006-01-01 Termina… Arrival Domestic 401535
## 6 2006-01-01 Termina… Departure Domestic 389745
## 7 2006-01-01 Termina… Departure International 561
## 8 2006-01-01 Termina… Arrival Domestic 98991
## 9 2006-01-01 Termina… Arrival International 163067
## 10 2006-01-01 Termina… Departure Domestic 93672
## # … with 5,860 more rows, and 2 more variables: Year <dbl>, Month <dbl>
LA_Airport_Mod <- LA_Airport_Modified %>%
filter(Year != 2019)
LA_Airport_Mod
## # A tibble: 5,755 x 7
## ReportPeriod Terminal Arrival_Departu… Domestic_Intern… Passenger_Count
## <date> <chr> <chr> <chr> <dbl>
## 1 2006-01-01 Imperia… Arrival Domestic 490
## 2 2006-01-01 Imperia… Departure Domestic 498
## 3 2006-01-01 Misc. T… Arrival Domestic 753
## 4 2006-01-01 Misc. T… Departure Domestic 688
## 5 2006-01-01 Termina… Arrival Domestic 401535
## 6 2006-01-01 Termina… Departure Domestic 389745
## 7 2006-01-01 Termina… Departure International 561
## 8 2006-01-01 Termina… Arrival Domestic 98991
## 9 2006-01-01 Termina… Arrival International 163067
## 10 2006-01-01 Termina… Departure Domestic 93672
## # … with 5,745 more rows, and 2 more variables: Year <dbl>, Month <dbl>
str(LA_Airport_Mod)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 5755 obs. of 7 variables:
## $ ReportPeriod : Date, format: "2006-01-01" "2006-01-01" ...
## $ Terminal : chr "Imperial Terminal" "Imperial Terminal" "Misc. Terminal" "Misc. Terminal" ...
## $ Arrival_Departure : chr "Arrival" "Departure" "Arrival" "Departure" ...
## $ Domestic_International: chr "Domestic" "Domestic" "Domestic" "Domestic" ...
## $ Passenger_Count : num 490 498 753 688 401535 ...
## $ Year : num 2006 2006 2006 2006 2006 ...
## $ Month : num 1 1 1 1 1 1 1 1 1 1 ...
LA_Airport_Mod$Terminal <- gsub("\\s+","_",LA_Airport_Mod$Terminal)
LA_Airport_Mod$Terminal <- as.factor(LA_Airport_Mod$Terminal)
LA_Airport_Mod$Arrival_Departure <- as.factor(LA_Airport_Mod$Arrival_Departure)
LA_Airport_Mod$Domestic_International <- as.factor(LA_Airport_Mod$Domestic_International)
summary(LA_Airport_Mod)
## ReportPeriod Terminal Arrival_Departure
## Min. :2006-01-01 Terminal_2: 624 Arrival :2835
## 1st Qu.:2009-04-01 Terminal_5: 624 Departure:2920
## Median :2012-07-01 Terminal_7: 624
## Mean :2012-06-28 Terminal_4: 623
## 3rd Qu.:2015-10-01 Terminal_6: 621
## Max. :2018-12-01 Terminal_8: 600
## (Other) :2039
## Domestic_International Passenger_Count Year Month
## Domestic :3116 Min. : 1 Min. :2006 Min. : 1.000
## International:2639 1st Qu.: 19423 1st Qu.:2009 1st Qu.: 3.000
## Median :105012 Median :2012 Median : 6.000
## Mean :154558 Mean :2012 Mean : 6.489
## 3rd Qu.:270620 3rd Qu.:2015 3rd Qu.: 9.000
## Max. :889348 Max. :2018 Max. :12.000
##
Yearwise_Data <- LA_Airport_Mod %>%
select(Terminal,Year,Passenger_Count) %>%
group_by(Terminal,Year) %>%
summarize(Passenger_sum = sum(Passenger_Count))
Yearwise_Data
## # A tibble: 141 x 3
## # Groups: Terminal [11]
## Terminal Year Passenger_sum
## <fct> <dbl> <dbl>
## 1 Imperial_Terminal 2006 5579
## 2 Imperial_Terminal 2007 2432
## 3 Imperial_Terminal 2008 3985
## 4 Imperial_Terminal 2009 530
## 5 Imperial_Terminal 2010 1428
## 6 Imperial_Terminal 2011 1497
## 7 Imperial_Terminal 2012 1047
## 8 Imperial_Terminal 2013 434
## 9 Imperial_Terminal 2014 358
## 10 Imperial_Terminal 2015 1167
## # … with 131 more rows
busiest_terminal <- Yearwise_Data%>%
filter(Passenger_sum == max(Yearwise_Data$Passenger_sum))
busiest_terminal
## # A tibble: 1 x 3
## # Groups: Terminal [1]
## Terminal Year Passenger_sum
## <fct> <dbl> <dbl>
## 1 Tom_Bradley_International_Terminal 2018 17777848
Domestic_International_Data_perYear <- LA_Airport_Mod %>%
select(Domestic_International,Year,Passenger_Count) %>%
group_by(Domestic_International,Year) %>%
summarize(Passenger_sum = sum(Passenger_Count))
Domestic_International_Data_perYear
## # A tibble: 26 x 3
## # Groups: Domestic_International [2]
## Domestic_International Year Passenger_sum
## <fct> <dbl> <dbl>
## 1 Domestic 2006 44129926
## 2 Domestic 2007 45190615
## 3 Domestic 2008 43137056
## 4 Domestic 2009 41419913
## 5 Domestic 2010 43134145
## 6 Domestic 2011 45130728
## 7 Domestic 2012 46535207
## 8 Domestic 2013 48815518
## 9 Domestic 2014 51560202
## 10 Domestic 2015 53981347
## # … with 16 more rows
Domestic_International_Data_Terminal_Year <- LA_Airport_Mod %>%
select(Terminal,Domestic_International,Year,Passenger_Count) %>%
group_by(Terminal,Domestic_International,Year) %>%
summarize(Passenger_sum = sum(Passenger_Count))
Domestic_International_Data_Terminal_Year
## # A tibble: 267 x 4
## # Groups: Terminal, Domestic_International [22]
## Terminal Domestic_International Year Passenger_sum
## <fct> <fct> <dbl> <dbl>
## 1 Imperial_Terminal Domestic 2006 5480
## 2 Imperial_Terminal Domestic 2007 2297
## 3 Imperial_Terminal Domestic 2008 3960
## 4 Imperial_Terminal Domestic 2009 49
## 5 Imperial_Terminal Domestic 2010 1428
## 6 Imperial_Terminal Domestic 2011 1497
## 7 Imperial_Terminal Domestic 2012 817
## 8 Imperial_Terminal Domestic 2013 434
## 9 Imperial_Terminal Domestic 2014 239
## 10 Imperial_Terminal Domestic 2015 1167
## # … with 257 more rows
Domestic_Data_Terminal <- LA_Airport_Mod %>%
select(Terminal,Domestic_International,Year,Passenger_Count) %>%
filter(Domestic_International == "Domestic") %>%
group_by(Terminal,Year) %>%
summarize(Passenger_sum = sum(Passenger_Count))
Domestic_Data_Terminal
## # A tibble: 140 x 3
## # Groups: Terminal [11]
## Terminal Year Passenger_sum
## <fct> <dbl> <dbl>
## 1 Imperial_Terminal 2006 5480
## 2 Imperial_Terminal 2007 2297
## 3 Imperial_Terminal 2008 3960
## 4 Imperial_Terminal 2009 49
## 5 Imperial_Terminal 2010 1428
## 6 Imperial_Terminal 2011 1497
## 7 Imperial_Terminal 2012 817
## 8 Imperial_Terminal 2013 434
## 9 Imperial_Terminal 2014 239
## 10 Imperial_Terminal 2015 1167
## # … with 130 more rows
International_Data_Terminal <- LA_Airport_Mod %>%
select(Terminal,Domestic_International,Year,Passenger_Count) %>%
filter(Domestic_International == "International") %>%
group_by(Terminal,Year) %>%
summarize(Passenger_sum = sum(Passenger_Count))
International_Data_Terminal
## # A tibble: 127 x 3
## # Groups: Terminal [11]
## Terminal Year Passenger_sum
## <fct> <dbl> <dbl>
## 1 Imperial_Terminal 2006 99
## 2 Imperial_Terminal 2007 135
## 3 Imperial_Terminal 2008 25
## 4 Imperial_Terminal 2009 481
## 5 Imperial_Terminal 2012 230
## 6 Imperial_Terminal 2014 119
## 7 Misc._Terminal 2006 7
## 8 Misc._Terminal 2007 19
## 9 Misc._Terminal 2008 155
## 10 Misc._Terminal 2009 876
## # … with 117 more rows
library(ggplot2)
library(RColorBrewer)
library(gganimate)
library(ggthemes)
plot1<- ggplot(Yearwise_Data, aes(x = Year, y = Passenger_sum, color = Terminal)) +
geom_line() +
geom_point() +
xlab("Year") +
ylab("Passenger Count") +
scale_x_continuous(breaks=c(2006,2008,2010,2012,2014,2016,2018)) +
scale_y_log10() +
transition_reveal(Year) +
scale_color_brewer(palette = "Set3") +
ggtitle("LA Passenger Traffic per terminal per year")+
theme_grey() +
theme(axis.text.x = element_text(angle= 80,hjust=1)) +
enter_fade() +
exit_fade()
plot1
plot2 <- ggplot(Domestic_International_Data_Terminal_Year, aes(x=Terminal, y=Passenger_sum, frame= Year)) +
geom_bar(stat = "identity", position = "stack", aes(fill = Domestic_International)) +
scale_y_continuous() +
labs( x = 'Terminal', y = 'Passenger Count', title = 'Domestic vs.International Passenger Traffic', caption = "based on data from Data.gov", subtitle = 'Year:{as.integer(frame_time)}')+
theme(axis.text.x = element_text(angle= 80,hjust=1))
anim1 <- plot2 + transition_time(as.integer(Year))
anim1
plot3 <- ggplot(Domestic_International_Data_perYear, aes(x = Year, y = Passenger_sum, color = Domestic_International)) +
geom_line()
plot3